# Course: 5210 Communicating Data
# Purpose: Technical Appendix of MidTerm Project
# Author: Renato Albolea, Sakshi Madan
# define default values for code chunks
knitr::opts_chunk$set(message = FALSE, dpi=300)
# Clear environment
rm(list = ls(all = TRUE))
# Clear environmet of packages
if(is.null(sessionInfo()$otherPkgs) == FALSE)
lapply(paste("package:", names(sessionInfo()$otherPkgs), sep=""),
detach, character.only = TRUE, unload = TRUE)
# Load Packages
library(tidyverse)
library(scales) #format numbers as currency
library(here) # easier way to find file path
library(kableExtra) #improved tables
library(gridExtra) # use to put graphs together in the same frame
library(ggthemes) #themes for graphs
library(GGally) #EDA Analyzis
library(qwraps2) #Nicer Summary
library(magrittr) #ables %<>%
library(janitor) #Tools for Examining and Cleaning Dirty Data
library(rcompanion) # to run pairwiseMedianTest function in the rcompanion package, which conducts Mood’s median test on all pairs of groups from one-way data
library(tools) # Apply toTitleCase function
# define the markup language we are working in.
library(colorspace)
library(plotly)
options(qwraps2_markup = "markdown")
gm_data <- read_csv('/Users/sakshi/Documents/Syllabus/Data Visualization/Midterm Project/mtp_data.csv')
#gm_data <- read_csv(here('Midterm Project','mtp_data.csv'))
gm_data %>% head() %>% kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = T)
| UPC | iri_key | week | units | brand | flavor | package | volume | price | promo | ad |
|---|---|---|---|---|---|---|---|---|---|---|
| 00-01-16000-11653 | 644347 | 1484 | 5 | GENERAL MILLS CINNAMON TST CR | CINNAMON TOAST | BOX | 0.06 | 0.5 | 0 | A |
| 00-01-16000-11653 | 248741 | 1483 | 2 | GENERAL MILLS CINNAMON TST CR | CINNAMON TOAST | BOX | 0.06 | 0.5 | 0 | NONE |
| 00-01-16000-11653 | 535806 | 1489 | 3 | GENERAL MILLS CINNAMON TST CR | CINNAMON TOAST | BOX | 0.06 | 0.5 | 0 | NONE |
| 00-01-16000-11945 | 675634 | 1489 | 2 | GENERAL MILLS CHEERIOS | TOASTED | BOX | 0.04 | 0.5 | 0 | NONE |
| 00-01-16000-11945 | 205272 | 1491 | 8 | GENERAL MILLS CHEERIOS | TOASTED | BOX | 0.04 | 0.5 | 0 | NONE |
| 00-01-16000-11945 | 248741 | 1492 | 5 | GENERAL MILLS CHEERIOS | TOASTED | BOX | 0.04 | 0.5 | 0 | NONE |
gm_data %<>% mutate(iri_key = as.factor(iri_key),
UPC = as.factor(UPC),
promo = as.factor(promo),
flavor = as.factor(flavor),
ad = as.factor(ad),
package = as.factor(package)
)
# Separating in a bad way the brand and producer
gm_data %<>% separate(brand,"GENERAL MILLS ",into=c("aux", "flavor_GM"),remove = FALSE) %>%
separate(brand,"KELLOGGS ",into=c("aux", "flavor_KL"),remove = FALSE) %>%
separate(brand,"POST ",into=c("aux", "flavor_PT"),remove = FALSE) %>%
mutate(producer = case_when( not(is.na(flavor_GM)) ~ "General Mills",
not(is.na(flavor_KL)) ~ "Kelloggs",
not(is.na(flavor_PT)) ~ "POST",
TRUE ~ "ERROR"
),
brand = case_when(producer=="General Mills" ~ toTitleCase(tolower(flavor_GM)),
producer=="Kelloggs" ~ toTitleCase(tolower(flavor_KL)),
producer=="POST" ~ toTitleCase(tolower(flavor_PT))
),
producer = as.factor(producer),
brand = as.factor(brand),
ad = factor(ad,
levels = c("A","B","NONE"),
labels = c("Medium","Small","None")
),
revenue = units*price) %>%
select(-aux,-flavor_GM,-flavor_KL,-flavor_PT)
gm_data %>% str()
## Classes 'tbl_df', 'tbl' and 'data.frame': 21850 obs. of 13 variables:
## $ UPC : Factor w/ 114 levels "00-01-16000-11653",..: 1 1 1 2 2 2 3 3 3 3 ...
## $ iri_key : Factor w/ 1420 levels "200171","200197",..: 1041 446 1018 1217 48 446 1295 794 1184 1043 ...
## $ week : num 1484 1483 1489 1489 1491 ...
## $ units : num 5 2 3 2 8 5 6 1 4 14 ...
## $ brand : Factor w/ 15 levels "Cheerios","Cinnamon Tst Cr",..: 2 2 2 1 1 1 2 2 2 2 ...
## $ flavor : Factor w/ 5 levels "CINNAMON TOAST",..: 1 1 1 5 5 5 1 1 1 1 ...
## $ package : Factor w/ 2 levels "BOX","CUP": 1 1 1 1 1 1 2 2 2 2 ...
## $ volume : num 0.06 0.06 0.06 0.04 0.04 0.04 0.12 0.12 0.12 0.12 ...
## $ price : num 0.5 0.5 0.5 0.5 0.5 0.5 1.09 1.59 1.59 1 ...
## $ promo : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ ad : Factor w/ 3 levels "Medium","Small",..: 1 3 3 3 3 3 3 3 3 3 ...
## $ producer: Factor w/ 3 levels "General Mills",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ revenue : num 2.5 1 1.5 1 4 2.5 6.54 1.59 6.36 14 ...
Question: Does Flavor give us any new information?
gm_data %>% group_by(brand,flavor) %>% summarise(n()) %>% kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = T)
| brand | flavor | n() |
|---|---|---|
| Cheerios | REGULAR | 4 |
| Cheerios | TOASTED | 1454 |
| Cinnamon Tst Cr | CINNAMON TOAST | 1834 |
| Cocoa Krispies | COCOA | 881 |
| Cocoa Puffs | COCOA | 1020 |
| Froot Loops | FRUIT | 2192 |
| Frosted Flakes | REGULAR | 2295 |
| Frosted Mini Wheats | REGULAR | 1574 |
| Grape Nuts | REGULAR | 1289 |
| Kix | REGULAR | 1196 |
| Lucky Charms | REGULAR | 3 |
| Lucky Charms | TOASTED | 1678 |
| Raisin Bran | REGULAR | 1266 |
| Rice Krispies | TOASTED | 1450 |
| Shredded Wheat | REGULAR | 1189 |
| Smart Start | TOASTED | 1134 |
| Special k | TOASTED | 1391 |
gm_data %>% summary()
## UPC iri_key week units
## 00-01-43000-10521: 676 656444 : 35 Min. :1479 Min. : 1.000
## 00-01-38000-01621: 666 256951 : 31 1st Qu.:1492 1st Qu.: 3.000
## 00-01-38000-00828: 660 259661 : 31 Median :1505 Median : 7.000
## 00-01-16000-27569: 639 267403 : 31 Mean :1505 Mean : 8.579
## 00-02-38000-66330: 618 652139 : 31 3rd Qu.:1518 3rd Qu.:12.000
## 00-01-38000-01611: 612 681735 : 31 Max. :1530 Max. :28.000
## (Other) :17979 (Other):21660
## brand flavor package
## Frosted Flakes : 2295 CINNAMON TOAST:1834 BOX:21306
## Froot Loops : 2192 COCOA :1901 CUP: 544
## Cinnamon Tst Cr : 1834 FRUIT :2192
## Lucky Charms : 1681 REGULAR :8816
## Frosted Mini Wheats: 1574 TOASTED :7107
## Cheerios : 1458
## (Other) :10816
## volume price promo ad
## Min. :0.040 Min. :0.250 0:17305 Medium: 1456
## 1st Qu.:0.750 1st Qu.:3.190 1: 4545 Small : 1061
## Median :1.060 Median :3.790 None :19333
## Mean :1.016 Mean :3.763
## 3rd Qu.:1.120 3rd Qu.:4.390
## Max. :4.000 Max. :9.990
##
## producer revenue
## General Mills: 7189 Min. : 0.48
## Kelloggs :12183 1st Qu.: 11.80
## POST : 2478 Median : 24.50
## Mean : 31.01
## 3rd Qu.: 44.09
## Max. :155.48
##
# Uni-variate graphical analysis of quantiative variables
grid.arrange(gm_data %>% ggplot(mapping = aes(x = week)) + geom_histogram(),
gm_data %>% ggplot(mapping = aes(x = 1, y = week)) + geom_boxplot() + coord_flip(),
ncol = 1)
- The weekly sale seems to have a pattern that could be explored in the future.
# Uni-variate graphical analysis of quantiative variables
grid.arrange(gm_data %>% ggplot(mapping = aes(x = units)) + geom_histogram(bins = max(gm_data$units)),
gm_data %>% ggplot(mapping = aes(x = 1, y = units)) + geom_boxplot() + coord_flip(),
ncol = 1)
- The number of units sold per transaction varies and follows a log normal distribution as expected.
- It would be interesting to understand better the relationship among number of units, volume, promotion, and ads for larger amounts of units in a following project.
# Uni-variate graphical analysis of quantiative variables
grid.arrange(gm_data %>% ggplot(mapping = aes(x = volume)) + geom_histogram(),
gm_data %>% ggplot(mapping = aes(x = 1, y = volume)) + geom_boxplot() + coord_flip(),
ncol = 1)
- Cereals are sold in 7 main sizes.
# Uni-variate graphical analysis of quantiative variables
grid.arrange(gm_data %>% ggplot(mapping = aes(x = price)) + geom_histogram(),
gm_data %>% ggplot(mapping = aes(x = 1, y = price)) + geom_boxplot() + coord_flip(),
ncol = 1)
- Price seems to be skewed, thus we should use median instead of mean.
# Uni-variate graphical analysis of quantiative variables
grid.arrange(gm_data %>% ggplot(mapping = aes(x = revenue)) + geom_histogram(),
gm_data %>% ggplot(mapping = aes(x = 1, y = revenue)) + geom_boxplot() + coord_flip(),
ncol = 1)
- Revenue seems to be skewed, thus we should use median instead of mean.
# Uni-variate graphical analysis of factor variables
gm_data %>% ggplot(mapping = aes(x = iri_key)) + geom_bar()
# Uni-variate graphical analysis of factor variables
gm_data %>% ggplot(mapping = aes(x = flavor)) + geom_bar()
- Regular and Toasted are the main flavors.
# Uni-variate graphical analysis of factor variables
gm_data %>% ggplot(mapping = aes(x = package)) + geom_bar()
- Consumers buy almost only boxes.
# Uni-variate graphical analysis of factor variables
gm_data %>% ggplot(mapping = aes(x = promo)) + geom_bar()
- Promo periods seems to be a good size of the data.
# Uni-variate graphical analysis of factor variables
gm_data %>% ggplot(mapping = aes(x = ad)) + geom_bar()
- Producers seems to use more promo than ads.
# Uni-variate graphical analysis of factor variables
gm_data %>% ggplot(mapping = aes(x = producer)) + geom_bar()
- Kelloggs is the main brand.
# Uni-variate graphical analysis of factor variables
gm_data %>% ggplot(mapping = aes(x = brand)) + geom_bar()
# Proportion contingency/cross table
gm_data %>%
tabyl(producer, promo) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages("row") %>%
adorn_pct_formatting() %>%
adorn_ns() %>%
kable()%>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
| producer | 0 | 1 | Total |
|---|---|---|---|
| General Mills | 82.2% (5909) | 17.8% (1280) | 100.0% (7189) |
| Kelloggs | 77.7% (9470) | 22.3% (2713) | 100.0% (12183) |
| POST | 77.7% (1926) | 22.3% (552) | 100.0% (2478) |
| Total | 79.2% (17305) | 20.8% (4545) | 100.0% (21850) |
# Proportion contingency/cross table
gm_data %>%
tabyl(producer, ad) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages("row") %>%
adorn_pct_formatting() %>%
adorn_ns() %>%
kable()%>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
| producer | Medium | Small | None | Total |
|---|---|---|---|---|
| General Mills | 6.1% (442) | 3.8% (272) | 90.1% (6475) | 100.0% (7189) |
| Kelloggs | 7.4% (903) | 5.5% (664) | 87.1% (10616) | 100.0% (12183) |
| POST | 4.5% (111) | 5.0% (125) | 90.5% (2242) | 100.0% (2478) |
| Total | 6.7% (1456) | 4.9% (1061) | 88.5% (19333) | 100.0% (21850) |
# Proportion contingency/cross table
gm_data %>%
tabyl(producer, flavor) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages("row") %>%
adorn_pct_formatting() %>%
adorn_ns() %>%
kable()%>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = T)
| producer | CINNAMON TOAST | COCOA | FRUIT | REGULAR | TOASTED | Total |
|---|---|---|---|---|---|---|
| General Mills | 25.5% (1834) | 14.2% (1020) | 0.0% (0) | 16.7% (1203) | 43.6% (3132) | 100.0% (7189) |
| Kelloggs | 0.0% (0) | 7.2% (881) | 18.0% (2192) | 42.1% (5135) | 32.6% (3975) | 100.0% (12183) |
| POST | 0.0% (0) | 0.0% (0) | 0.0% (0) | 100.0% (2478) | 0.0% (0) | 100.0% (2478) |
| Total | 8.4% (1834) | 8.7% (1901) | 10.0% (2192) | 40.3% (8816) | 32.5% (7107) | 100.0% (21850) |
# Proportion contingency/cross table
gm_data %>%
tabyl(flavor, promo) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages("row") %>%
adorn_pct_formatting() %>%
adorn_ns() %>%
kable()%>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
| flavor | 0 | 1 | Total |
|---|---|---|---|
| CINNAMON TOAST | 83.8% (1537) | 16.2% (297) | 100.0% (1834) |
| COCOA | 74.5% (1416) | 25.5% (485) | 100.0% (1901) |
| FRUIT | 76.3% (1673) | 23.7% (519) | 100.0% (2192) |
| REGULAR | 80.0% (7057) | 20.0% (1759) | 100.0% (8816) |
| TOASTED | 79.1% (5622) | 20.9% (1485) | 100.0% (7107) |
| Total | 79.2% (17305) | 20.8% (4545) | 100.0% (21850) |
# Proportion contingency/cross table
gm_data %>%
tabyl(flavor, ad) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages("row") %>%
adorn_pct_formatting() %>%
adorn_ns() %>%
kable()%>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
| flavor | Medium | Small | None | Total |
|---|---|---|---|---|
| CINNAMON TOAST | 6.2% (114) | 2.8% (51) | 91.0% (1669) | 100.0% (1834) |
| COCOA | 6.7% (128) | 4.8% (91) | 88.5% (1682) | 100.0% (1901) |
| FRUIT | 7.0% (153) | 4.9% (108) | 88.1% (1931) | 100.0% (2192) |
| REGULAR | 5.9% (516) | 5.0% (439) | 89.2% (7861) | 100.0% (8816) |
| TOASTED | 7.7% (545) | 5.2% (372) | 87.1% (6190) | 100.0% (7107) |
| Total | 6.7% (1456) | 4.9% (1061) | 88.5% (19333) | 100.0% (21850) |
# Proportion contingency/cross table
gm_data %>%
tabyl(promo, ad) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages("row") %>%
adorn_pct_formatting() %>%
adorn_ns() %>%
kable()
| promo | Medium | Small | None | Total |
|---|---|---|---|---|
| 0 | 3.9% (683) | 2.3% (402) | 93.7% (16220) | 100.0% (17305) |
| 1 | 17.0% (773) | 14.5% (659) | 68.5% (3113) | 100.0% (4545) |
| Total | 6.7% (1456) | 4.9% (1061) | 88.5% (19333) | 100.0% (21850) |
# Proportion contingency/cross table
gm_data %>%
tabyl(producer, flavor, promo) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages("row") %>%
adorn_pct_formatting() %>%
adorn_ns()
## $`0`
## producer CINNAMON TOAST COCOA FRUIT REGULAR
## General Mills 26.0% (1537) 13.0% (769) 0.0% (0) 17.0% (1004)
## Kelloggs 0.0% (0) 6.8% (647) 17.7% (1673) 43.6% (4127)
## POST 0.0% (0) 0.0% (0) 0.0% (0) 100.0% (1926)
## Total 8.9% (1537) 8.2% (1416) 9.7% (1673) 40.8% (7057)
## TOASTED Total
## 44.0% (2599) 100.0% (5909)
## 31.9% (3023) 100.0% (9470)
## 0.0% (0) 100.0% (1926)
## 32.5% (5622) 100.0% (17305)
##
## $`1`
## producer CINNAMON TOAST COCOA FRUIT REGULAR
## General Mills 23.2% (297) 19.6% (251) 0.0% (0) 15.5% (199)
## Kelloggs 0.0% (0) 8.6% (234) 19.1% (519) 37.2% (1008)
## POST 0.0% (0) 0.0% (0) 0.0% (0) 100.0% (552)
## Total 6.5% (297) 10.7% (485) 11.4% (519) 38.7% (1759)
## TOASTED Total
## 41.6% (533) 100.0% (1280)
## 35.1% (952) 100.0% (2713)
## 0.0% (0) 100.0% (552)
## 32.7% (1485) 100.0% (4545)
# Proportion contingency/cross table
gm_data %>%
tabyl(producer, flavor, ad) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages("row") %>%
adorn_pct_formatting() %>%
adorn_ns()
## $Medium
## producer CINNAMON TOAST COCOA FRUIT REGULAR
## General Mills 25.8% (114) 17.2% (76) 0.0% (0) 15.8% (70)
## Kelloggs 0.0% (0) 5.8% (52) 16.9% (153) 37.1% (335)
## POST 0.0% (0) 0.0% (0) 0.0% (0) 100.0% (111)
## Total 7.8% (114) 8.8% (128) 10.5% (153) 35.4% (516)
## TOASTED Total
## 41.2% (182) 100.0% (442)
## 40.2% (363) 100.0% (903)
## 0.0% (0) 100.0% (111)
## 37.4% (545) 100.0% (1456)
##
## $Small
## producer CINNAMON TOAST COCOA FRUIT REGULAR
## General Mills 18.8% (51) 19.9% (54) 0.0% (0) 20.2% (55)
## Kelloggs 0.0% (0) 5.6% (37) 16.3% (108) 39.0% (259)
## POST 0.0% (0) 0.0% (0) 0.0% (0) 100.0% (125)
## Total 4.8% (51) 8.6% (91) 10.2% (108) 41.4% (439)
## TOASTED Total
## 41.2% (112) 100.0% (272)
## 39.2% (260) 100.0% (664)
## 0.0% (0) 100.0% (125)
## 35.1% (372) 100.0% (1061)
##
## $None
## producer CINNAMON TOAST COCOA FRUIT REGULAR
## General Mills 25.8% (1669) 13.7% (890) 0.0% (0) 16.6% (1078)
## Kelloggs 0.0% (0) 7.5% (792) 18.2% (1931) 42.8% (4541)
## POST 0.0% (0) 0.0% (0) 0.0% (0) 100.0% (2242)
## Total 8.6% (1669) 8.7% (1682) 10.0% (1931) 40.7% (7861)
## TOASTED Total
## 43.8% (2838) 100.0% (6475)
## 31.6% (3352) 100.0% (10616)
## 0.0% (0) 100.0% (2242)
## 32.0% (6190) 100.0% (19333)
gm_data %>% select(week, units, price, volume) %>% cor()
## week units price volume
## week 1.00000000 -0.03405928 0.02839918 -0.01999516
## units -0.03405928 1.00000000 -0.19070782 0.02371249
## price 0.02839918 -0.19070782 1.00000000 0.54332999
## volume -0.01999516 0.02371249 0.54332999 1.00000000
There is a week correlation (0.54333) between volume and price. We need to verify if it is statistical significant.
During promotions General Mills and Kelloggs sell more Cocoa and Fruit flavors
gm_data %>% ggplot(aes(x=producer,y=price))+geom_boxplot()
gm_data %>% ggplot(aes(x=price,fill=producer))+geom_density(alpha=0.5)
gm_data %>% ggplot(aes(x=flavor,y=price))+geom_boxplot()
med_price = gm_data %>% group_by(flavor) %>% summarise(med_price = median(price))
gm_data %>% ggplot(aes(x=flavor,y=price, color=producer))+geom_boxplot()
gm_data %>% ggplot(aes(x=volume,y=price, color=producer))+geom_point() + geom_smooth(mapping = aes(color = producer), method = "lm", se = FALSE)
gm_data %>% ggplot(aes(x=volume,y=price/volume, color=producer))+geom_point() + geom_smooth(mapping = aes(color = producer), method = "loess", se = FALSE)
#gm_data %>% select(-week,-iri_key, -UPC) %>% ggpairs()
gm_data %>% select(units,flavor,promo,ad,producer,price) %>% ggpairs()
gm_data %>% ggplot(aes(x=producer,y = revenue)) + geom_boxplot()
gm_data %>% ggplot(aes(x=producer,y = revenue, color=promo)) + geom_boxplot()
gm_data %>% ggplot(aes(x=producer,y = units, color=promo)) + geom_boxplot()
gm_data %>% ggplot(aes(x=producer,y = price, color=promo)) + geom_boxplot()
gm_data %>% ggplot(aes(x=producer,y = revenue, color=ad)) + geom_boxplot()
gm_data %>% ggplot(aes(x=producer,y = units, color=ad)) + geom_boxplot()
gm_data %>% ggplot(aes(x=producer,y = price, color=ad)) + geom_boxplot()
med_ads_prod <- gm_data %>% group_by(producer,ad) %>% summarise(med_revenue = median(revenue),
n = n())
med_ads_prod %>% ggplot(aes(x = producer, y = med_revenue, fill = ad)) + geom_bar(stat = 'identity', position = 'dodge') + coord_flip()
med_promo_prod <- gm_data %>% group_by(producer,promo) %>% summarise(med_revenue = median(revenue),
n = n())
med_promo_prod %>% ggplot(aes(x = producer, y = med_revenue, fill = promo)) + geom_bar(stat = 'identity', position = 'dodge') + coord_flip()
med_ads_flavor <- gm_data %>% group_by(flavor,ad) %>% summarise(med_revenue = median(revenue),
n = n())
med_ads_flavor %>% ggplot(aes(x = flavor, y = med_revenue, fill = ad)) + geom_bar(stat = 'identity', position = 'dodge') + coord_flip()
ggsave('revenue_flavor_ads.png')
med_promo_flavor <- gm_data %>% group_by(flavor,promo) %>% summarise(med_revenue = median(revenue),
n = n())
med_promo_flavor %>% ggplot(aes(x = flavor, y = med_revenue, fill = promo)) + geom_bar(stat = 'identity', position = 'dodge') + coord_flip()
med_ads_brand <- gm_data %>% group_by(producer, brand, ad) %>% summarise(med_revenue = median(revenue),
n = n())
med_ads_brand %>% ggplot(aes(x = paste(producer,brand), y = med_revenue, fill = ad)) + geom_bar(stat = 'identity', position = 'dodge') + coord_flip()
med_promo_brand <- gm_data %>% group_by(producer, brand,promo) %>% summarise(med_revenue = median(revenue),
n = n())
med_promo_brand %>% ggplot(aes(x = paste(producer,brand), y = med_revenue, fill = promo)) + geom_bar(stat = 'identity', position = 'dodge') + coord_flip()
ggsave('revenue_brand_promo.png')
med_ads_promo_producer <- gm_data %>% group_by(producer, ad, promo) %>% summarise(med_revenue = median(revenue),
n = n())
med_ads_promo_producer %>% ggplot(aes(x = producer, y = med_revenue, fill = paste(promo,ad))) +
geom_bar(stat = 'identity', position = 'dodge') + coord_flip()
gm_data %>% ggplot(aes(x = producer, y = revenue,color=paste(promo,ad))) +
geom_boxplot() + coord_flip()
gm_data %>% ggplot(aes(x=volume)) + geom_density()+facet_wrap( ~ producer)
gm_data %>% ggplot(aes(x=volume,fill=ad)) + geom_density(alpha=0.4)+facet_grid(rows= vars(producer))
gm_data %>% ggplot(aes(x=units)) + geom_density()+facet_wrap( ~ producer)
gm_data %>% ggplot(aes(x=units,fill=ad)) + geom_density(alpha=0.4)+facet_grid(rows= vars(producer))
When we have ads, people tend to buy more units than in no ads periods.
As we saw earlier in this document since revenue is a skewed measure we should use the median instead of the mean. To test if the medians are different we will utilize the Mood’s median test.
median_test <- gm_data %>% filter(producer == "General Mills") %>% mutate(promo_ad = paste(promo,ad),
promo_ad = as.factor(promo_ad))
PT = pairwiseMedianMatrix(revenue ~ promo_ad,
data = median_test,
exact = NULL,
method = "bonferroni")
kable(PT$Adjusted) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = T)
| 0 Medium | 0 None | 0 Small | 1 Medium | 1 None | 1 Small | |
|---|---|---|---|---|---|---|
| 0 Medium | 1 | 1.000000 | 1 | 1.000000 | 1.000000 | 1 |
| 0 None | 1 | 1.000000 | 1 | 0.003402 | 0.007353 | 1 |
| 0 Small | 1 | 1.000000 | 1 | 1.000000 | 1.000000 | 1 |
| 1 Medium | 1 | 0.003402 | 1 | 1.000000 | 0.725200 | 1 |
| 1 None | 1 | 0.007353 | 1 | 0.725200 | 1.000000 | 1 |
| 1 Small | 1 | 1.000000 | 1 | 1.000000 | 1.000000 | 1 |
Thus, we can conclude that General Mills’ median revenue during promo and medium ads (median revenue =$38.25 ) is statistically different from no promo with no ads (median revenue = $28.74, p-value adjusted = 0.3402%) at 95% confidence level, and that General Mills’ median revenue, during promo and no ads(median revenue = $32.32), is also different from no promo and no ads (p-value adjusted = 0.7353%) at 95% confidence level.
According to the data set, when General Mills uses promo and medium ads, the median revenue increases around 33.1% in comparison to the base case scenario(no promo and no ads). Similarly, with promo and no ads the median revenue increases around 12.5% in comparison to the base case scenario.
Although the median revenue from the scenario of promo and medium ads is not statistically different from the case with promo and no ads (p-value adjusted = 100% ) the increase of revenue is practically significant (increase of 18.3%), thus we conclude that doing a medium ad campaign is important and should be considered to be used with promo even though the increase in revenue is not reliable.
med_ads_promo_GM <- gm_data %>% filter(producer == "General Mills") %>% group_by(ad, promo) %>% summarise(med_revenue = median(revenue),
n = n()) %>% ungroup()
sub_title = paste0("In that scenario median revenue increases ",
round(med_ads_promo_producer$med_revenue[2]/med_ads_promo_producer$med_revenue[5]-1,3)*100,
"% as compared \nto the base case of no Promo and no Ads")
med_ads_promo_GM %<>% mutate(scenario = as.factor(paste(promo,ad)),
scenario = factor(scenario,
levels = c("0 None","0 Small","0 Medium", "1 None","1 Small", "1 Medium"),
labels = c("No Promo, No Ad","No Promo, Small Ad","No Promo, Medium Ad",
"With Promo, No Ad","With Promo, Small Ad","With Promo, Medium Ad")
)
)
med_ads_promo_GM %>% ggplot(aes(x = reorder(scenario,-as.numeric(scenario)), y = med_revenue, fill = scenario)) +
geom_bar(stat = 'identity', position = 'dodge') + coord_flip() +
scale_y_continuous(labels = dollar) +
ggtitle("The highest median revenue for General Mills
is when using promo and medium ads",
sub = sub_title) +
labs(x = "", y = "Median Revenue") +
theme_economist_white(gray_bg =FALSE) + scale_colour_economist()+
scale_fill_manual(values=c("grey65", "grey65", "grey65", "grey65","grey65","grey25"))+
theme(legend.position = "none")
ggsave(filename = "General_Mills.png")
median_test_che <- gm_data %>% filter(brand == "Cheerios") %>% mutate(promo_ad = paste(promo,ad),
promo_ad = as.factor(promo_ad))
PT = pairwiseMedianMatrix(revenue ~ promo_ad,
data = median_test_che,
exact = NULL,
method = "bonferroni")
kable(PT$Adjusted) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = T)
| 0 Medium | 0 None | 0 Small | 1 Medium | 1 None | 1 Small | |
|---|---|---|---|---|---|---|
| 0 Medium | 1 | 1.0000 | 1 | 1 | 1.0000 | 1 |
| 0 None | 1 | 1.0000 | 1 | 1 | 0.1233 | 1 |
| 0 Small | 1 | 1.0000 | 1 | 1 | 1.0000 | 1 |
| 1 Medium | 1 | 1.0000 | 1 | 1 | 1.0000 | 1 |
| 1 None | 1 | 0.1233 | 1 | 1 | 1.0000 | 1 |
| 1 Small | 1 | 1.0000 | 1 | 1 | 1.0000 | 1 |
gm_data_che <- gm_data %>% filter(brand == "Cheerios") %>% mutate(promo_ad = paste(promo,ad)) %>% group_by(promo_ad) %>% summarise(med_rev = median(revenue))
For the null hypothesis of Cheerios median revenue under promo and no ads being equal to median revenue under no promo and no ads, we don’t have a strong enough evidence (p-value = 0.1233) to reject it at a confidence level of 95%. However, at a confidence level of 85%, we have sufficient evidence to reject the null hypothesis.
According to the data, Cheerios’ median revenue is 11.6% lower when in promo and no ads as compared to when not in promo and no ads, thus we conclude that General Mills should not use promos for Cheerios.
sub_title1 = paste0("The median revenue of Cheerios declines by ",-round(gm_data_che$med_rev[5] / gm_data_che$med_rev[2] - 1,3)*100,
"% when in promotion")
med_che_promo <- gm_data %>% filter(producer == "General Mills") %>% group_by(brand,promo) %>% summarise(med_revenue = median(revenue), n = n())
med_che_promo %>% ggplot(aes(x = reorder(brand,-med_revenue), y = med_revenue, fill = factor(promo, levels = rev(levels(promo))))) + geom_bar(stat = 'identity', position = "dodge") +
coord_flip() +
labs(x = "", y = "Median Revenue", title = "Cheerios is the only General Mills brand whose \nmedian revenue decreases with promotion",subtitle = sub_title1) +
scale_y_continuous(labels = dollar) +
scale_fill_manual(values = c('grey75','grey25'), name = "", labels = c("With Promo","Without Promo")) +
theme_economist_white(gray_bg =FALSE) + scale_colour_economist()+
theme(axis.ticks.y = element_blank(), legend.position = "right", legend.text = element_text(size = 9)) +
guides(fill = guide_legend(reverse = TRUE))
ggsave(filename = "Cheerios.png")
#Beginning
comparison_chart <- gm_data %>% group_by(producer, week) %>% summarise(total_rev = sum(revenue)) %>% ggplot(mapping = aes(x = week, y = total_rev, color = producer)) +
geom_line() +
labs(title = 'Total revenue generated by Kelloggs is much higher than General Mills', x = 'Week', y = 'Total Revenue', subtitle = '') +
theme_economist_white(gray_bg =FALSE) +
#theme_classic() +
theme(legend.position = 'FALSE') +
scale_color_manual(values = c('grey 25','dark red','light grey')) +
#scale_color_discrete_qualitative(palette = 'Warm') +
annotate("text", x = 1504, y = 8000, label = "Kelloggs", color = 'dark red') +
annotate("text", x = 1510, y = 5700, label = "General Mills", color = 'grey 25') +
annotate("text", x = 1510, y = 1700, label = "Post", color = 'dark grey') +
scale_y_continuous(labels = dollar)
comparison_chart
ggsave('comparison_chart.png',plot = comparison_chart)
#Middle1
med_ads_promo_GM <- gm_data %>% filter(producer == "General Mills") %>% group_by(ad, promo) %>% summarise(med_revenue = median(revenue),n =n()) %>% ungroup()
med_ads_promo_GM %<>% mutate(scenario = as.factor(paste(promo,ad)),scenario = factor(scenario,levels = c("0 None","0 Small","0 Medium", "1 None","1 Small", "1 Medium"),labels = c("No Promo, No Ad","No Promo, Small Ad","No Promo, Medium Ad","With Promo, No Ad","With Promo, Small Ad","With Promo, Medium Ad")))
general_mills_chart <- med_ads_promo_GM %>% ggplot(aes(x = reorder(scenario,-as.numeric(scenario)), y = med_revenue, fill = scenario)) +
geom_bar(stat = 'identity', position = 'dodge') +
coord_flip() +
scale_y_continuous(labels = dollar) +
ggtitle("The highest median revenue for General Mills is when using promo and medium ads", sub = 'sub_title') +
labs(x = "", y = "Median Revenue") +
theme_classic() +
#theme_economist_white(gray_bg =FALSE) + scale_colour_economist()+
scale_fill_manual(values=c("grey65", "grey65", "grey65", "grey65","grey65","grey25"))+
theme(legend.position = "none")
ggsave('general_mills_chart.png',plot = general_mills_chart)
#Middle2
med_che_promo <- gm_data %>% filter(producer == "General Mills") %>% group_by(brand,promo) %>% summarise(med_revenue = median(revenue), n = n())
cheerios_promo_chart <- med_che_promo %>% ggplot(aes(x = reorder(brand,-med_revenue), y = med_revenue, fill = factor(promo, levels = rev(levels(promo))))) + geom_bar(stat = 'identity', position = "dodge") +
coord_flip() +
labs(x = "", y = "Median Revenue", title = "Cheerios is the only General Mills brand whose \nmedian revenue decreases with promotion",subtitle = sub_title1) +
scale_y_continuous(labels = dollar) +
scale_fill_manual(values = c('grey75','grey25'), name = "", labels = c("With Promo","Without Promo")) +
theme_classic() +
#theme_economist_white(gray_bg =FALSE) + scale_colour_economist()+
theme(axis.ticks.y = element_blank(), legend.position = "right", legend.text = element_text(size = 9)) +
guides(fill = guide_legend(reverse = TRUE))
ggsave('cheerios_promo_chart.png',plot = cheerios_promo_chart)
#Middle3
med_che_ad <- gm_data %>% filter(producer == "General Mills") %>% group_by(brand,ad) %>% summarise(med_revenue = median(revenue), n = n())
cheerios_ad_chart <- med_che_ad %>% ggplot(aes(x = reorder(brand,-med_revenue), y = med_revenue, fill = ad)) + geom_bar(stat = 'identity', position = "dodge") +
coord_flip() +
labs(x = "", y = "Median Revenue", title = "Cheerios is the only General Mills brand whose \nmedian revenue decreases with ads",subtitle = sub_title1) +
scale_y_continuous(labels = dollar) +
#scale_fill_manual(values = c('grey75','grey25'), name = "", labels = c("With Promo","Without Promo")) +
theme_classic() +
#theme_economist_white(gray_bg =FALSE) + scale_colour_economist()+
theme(axis.ticks.y = element_blank(), legend.position = "right", legend.text = element_text(size = 9)) +
guides(fill = guide_legend(reverse = TRUE))
ggsave('cheerios_ad_chart.png',plot = cheerios_ad_chart)